Udacity - Exploratory Data Analysis workbook

8.7.18

To begin we will follow the example below. Look carefully at the syntax

# Load plotting and palette packages, attach the diamonds data set that
# ships with ggplot2, and draw a carat-vs-price scatterplot colored by cut
# using the ColorBrewer 'Accent' palette.
library(ggplot2)
library(RColorBrewer)
data(diamonds)
qplot(data=diamonds, x=carat,y=price,color=cut) + scale_color_brewer(palette = 'Accent')

# Show the current working directory, point it at the course data folder,
# read the state data, and keep only rows whose state.region code is 1
# (the Northeast, per the printout below).
getwd()
## [1] "/Users/brad/Documents/EDA_Data"
setwd('~/Documents/EDA_Data')
statesInfo<-read.csv('stateData.csv')
subset(statesInfo, state.region==1)
##                X state.abb state.area state.region population income
## 7    Connecticut        CT       5009            1       3100   5348
## 19         Maine        ME      33215            1       1058   3694
## 21 Massachusetts        MA       8257            1       5814   4755
## 29 New Hampshire        NH       9304            1        812   4281
## 30    New Jersey        NJ       7836            1       7333   5237
## 32      New York        NY      49576            1      18076   4903
## 38  Pennsylvania        PA      45333            1      11860   4449
## 39  Rhode Island        RI       1214            1        931   4558
## 45       Vermont        VT       9609            1        472   3907
##    illiteracy life.exp murder highSchoolGrad frost  area
## 7         1.1    72.48    3.1           56.0   139  4862
## 19        0.7    70.39    2.7           54.7   161 30920
## 21        1.1    71.83    3.3           58.5   103  7826
## 29        0.7    71.23    3.3           57.6   174  9027
## 30        1.1    70.93    5.2           52.5   115  7521
## 32        1.4    70.55   10.9           52.7    82 47831
## 38        1.0    70.43    6.1           50.2   126 44966
## 39        1.3    71.90    2.4           46.4   127  1049
## 45        0.6    71.64    5.5           57.1   168  9267

For the syntax above we have the following relationship: dataSet[Row condition, column condition]

If left blank, yields all columns

# Same subset via bracket indexing: rows where state.region == 1, all columns
# (the column position after the comma is left blank).
statesInfo[statesInfo$state.region==1,]
##                X state.abb state.area state.region population income
## 7    Connecticut        CT       5009            1       3100   5348
## 19         Maine        ME      33215            1       1058   3694
## 21 Massachusetts        MA       8257            1       5814   4755
## 29 New Hampshire        NH       9304            1        812   4281
## 30    New Jersey        NJ       7836            1       7333   5237
## 32      New York        NY      49576            1      18076   4903
## 38  Pennsylvania        PA      45333            1      11860   4449
## 39  Rhode Island        RI       1214            1        931   4558
## 45       Vermont        VT       9609            1        472   3907
##    illiteracy life.exp murder highSchoolGrad frost  area
## 7         1.1    72.48    3.1           56.0   139  4862
## 19        0.7    70.39    2.7           54.7   161 30920
## 21        1.1    71.83    3.3           58.5   103  7826
## 29        0.7    71.23    3.3           57.6   174  9027
## 30        1.1    70.93    5.2           52.5   115  7521
## 32        1.4    70.55   10.9           52.7    82 47831
## 38        1.0    70.43    6.1           50.2   126 44966
## 39        1.3    71.90    2.4           46.4   127  1049
## 45        0.6    71.64    5.5           57.1   168  9267

Now, to find subsets for illiteracy and high-school graduation rate from the statesInfo data set, we compute the following

# Subset states whose illiteracy rate is exactly 0.5% and states with a
# high-school graduation rate above 50%.
# NOTE(review): exact equality on a decimal column is fragile in general;
# it works here because the data stores one-decimal-place values.
illiteracybracket<-statesInfo[statesInfo$illiteracy==0.5,]
highschoolgradbracket <- statesInfo[statesInfo$highSchoolGrad>50,]
# Bug fix: with data=statesInfo supplied, aesthetics should be bare column
# names; the original color=statesInfo$highSchoolGrad bypassed the data
# argument (and breaks if statesInfo is ever reordered or subset).
qplot(data=statesInfo, x=state.region, y=illiteracy, color=highSchoolGrad)

We now move onto the following data set # REDDIT DATA SET

# Read the Reddit survey data and inspect its structure: column types,
# factor levels, and the first few values of each variable.
reddit<-read.csv("reddit.csv")
str(reddit)
## 'data.frame':    32754 obs. of  14 variables:
##  $ id               : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ gender           : int  0 0 1 0 1 0 0 0 0 0 ...
##  $ age.range        : Factor w/ 7 levels "18-24","25-34",..: 2 2 1 2 2 2 2 1 3 2 ...
##  $ marital.status   : Factor w/ 6 levels "Engaged","Forever Alone",..: NA NA NA NA NA 4 3 4 4 3 ...
##  $ employment.status: Factor w/ 6 levels "Employed full time",..: 1 1 2 2 1 1 1 4 1 2 ...
##  $ military.service : Factor w/ 2 levels "No","Yes": NA NA NA NA NA 1 1 1 1 1 ...
##  $ children         : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ education        : Factor w/ 7 levels "Associate degree",..: 2 2 5 2 2 2 5 2 2 5 ...
##  $ country          : Factor w/ 439 levels " Canada"," Canada eh",..: 394 394 394 394 394 394 125 394 394 125 ...
##  $ state            : Factor w/ 53 levels "","Alabama","Alaska",..: 33 33 48 33 6 33 1 6 33 1 ...
##  $ income.range     : Factor w/ 8 levels "$100,000 - $149,999",..: 2 2 8 2 7 2 NA 7 2 7 ...
##  $ fav.reddit       : Factor w/ 1834 levels "","___","-","?",..: 720 691 1511 1528 188 691 1318 571 1629 1 ...
##  $ dog.cat          : Factor w/ 3 levels "I like cats.",..: NA NA NA NA NA 2 2 2 1 1 ...
##  $ cheese           : Factor w/ 11 levels "American","Brie",..: NA NA NA NA NA 3 3 1 10 7 ...

We call a Factor a variable which contains some categorical information, such as gender, age, etc. For example we can determine employment status as follows

# Frequency table of employment status. table() silently drops NA entries;
# compare with summary() below, which reports the NA count as well.
table(reddit$employment.status)
## 
##                    Employed full time 
##                                 14814 
##                             Freelance 
##                                  1948 
## Not employed and not looking for work 
##                                   682 
##    Not employed, but looking for work 
##                                  2087 
##                               Retired 
##                                    85 
##                               Student 
##                                 12987

The above yields a frequency table of employment-status entries.

summary(reddit$employment.status)
##                    Employed full time 
##                                 14814 
##                             Freelance 
##                                  1948 
## Not employed and not looking for work 
##                                   682 
##    Not employed, but looking for work 
##                                  2087 
##                               Retired 
##                                    85 
##                               Student 
##                                 12987 
##                                  NA's 
##                                   151
# List the factor levels of age.range, then draw quick bar charts of
# age.range and income.range. Note the default level order is alphabetical,
# which is why the plots need reordering below.
levels(reddit$age.range)
## [1] "18-24"       "25-34"       "35-44"       "45-54"       "55-64"      
## [6] "65 or Above" "Under 18"
# notice 7 different ranges
qplot(reddit$age.range)

qplot(data=reddit, x=income.range)

We can rearrange the levels so that the graph is easier to read/more cogent.

# Re-level age.range into chronological order so the histogram reads
# left-to-right by age. NOTE(review): the trailing 'NA' level matches only
# the literal string "NA", not genuine missing values — verify whether the
# data actually contains that string.
reddit$age.range<-ordered(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'))
qplot(data=reddit, x=age.range)

Now the data is rearranged as required.

Alternatively, we can rearrange the data as follows

# Same reordering via factor(); ordered = TRUE marks the factor as ordinal.
# (Idiom fix: spell out TRUE rather than the reassignable shorthand T.)
reddit$age.range <- factor(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'), ordered = TRUE)

Similarly, we can do the same for the income brackets.

# Order the income brackets from lowest to highest and re-plot.
# Bug fix: the original passed reddit$age.range as the data argument,
# silently overwriting income.range with age data (every value then failed
# to match an income level and became NA).
# NOTE(review): as above, the 'NA' level only matches a literal "NA" string.
reddit$income.range<-ordered(reddit$income.range, levels=c("Under $20,000", '$20,000 - $29,999', '$30,000 - $39,999', '$40,000 - $49,999', '$50,000 - $69,999', '$70,000 - $99,999', '$100,000 - $149,999', '$150,000 or more', 'NA'))
qplot(data=reddit, x=income.range)

#Lesson 3 - exploring one variable First I downloaded the pseudo-facebook data, which is found in the EDA folder We can see the current working directory with the following command

getwd()
## [1] "/Users/brad/Documents/EDA_Data"

To change the directory we use the following command

setwd('~/Documents/EDA_Data')
list.files()
##  [1] "energy.csv"                                  
##  [2] "energy.xlsx"                                 
##  [3] "GDP.csv"                                     
##  [4] "GDP.xlsx"                                    
##  [5] "HIV.csv"                                     
##  [6] "HIV.xlsx"                                    
##  [7] "indicator alcohol consumption  20100830.xlsx"
##  [8] "indicator_t above 15 unemploy (1).xlsx"      
##  [9] "indicator_t above 15 unemploy.xlsx"          
## [10] "pseudo_facebook.tsv"                         
## [11] "reddit.csv"                                  
## [12] "stateData.csv"                               
## [13] "suicide.csv"                                 
## [14] "suicide.xlsx"                                
## [15] "udacity_workbook_files"                      
## [16] "udacity_workbook.html"                       
## [17] "udacity_workbook.R"                          
## [18] "udacity_workbook.Rmd"                        
## [19] "unemployment.csv"
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

Notice that the pseudo_facebook data is actually a TAB separated file (.tsv). Therefore, we must use the following command.

This data set was created by Udacity, so the stats may not align.

To begin the analysis, we start by looking at birth dates of pseudo-facebook users.

library(ggplot2)
#install.packages('ggthemes', dependencies = TRUE)
library(ggthemes)
names(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

A histogram of days of the month that individuals were born on is given by the command below

# Histogram of birth day-of-month, one bin per day, with an axis tick at
# every day from 1 to 31.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)

To motivate what we are actually going to do with the data, consider the following question: ##Q: Do users' perceptions of their audience actually match their real audience? This is an interesting question because it depends on how a user presents themselves. The survey asked: “How many people do you think saw this post?” As the researcher tells us, the gap between expectation and reality was apparent: people consistently underestimated their audience size.

Now we begin to move into a discussion of ggplot. While using the ggplot environment, the + sign adds a layer to a plot or graph. We can add a variety of layers, such as geom_point() for scatter plots or geom_line() for line plots; qplot defaults to a histogram when no geom is given.

Now we add facet_wrap function to break up the data into months. Also, note that the aes is necessary for the ggplot functions.

ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month,ncol=4)

Notice that changing ncol=n, yields a different number of columns, i.e. 3 or 4, etc. Syntax facet_wrap(~variable)

Handling outliers

The following code will produce a histogram of the number of the friend count.

qplot(x=friend_count, data=pf) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Furthermore, we can limit the range of the x axis with the following code

qplot(x=friend_count, data=pf, xlim=c(0,1000)) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).

Or rather

qplot(x=friend_count, data=pf) + 
  scale_x_continuous(limit=c(0,1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).

Notice the bin width error produced. #adding the following command then yields

qplot(x=friend_count, data=pf, binwidth=25) + 
  scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50)) 
## Warning: Removed 2951 rows containing non-finite values (stat_bin).

This makes the distribution a bit easier to read.

Data question: Which gender tends to have more friends?

A: We can see this by computing the following data.

qplot(x=friend_count, data=pf, binwidth=25) + 
  scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50)) +
  facet_grid(~gender)
## Warning: Removed 2951 rows containing non-finite values (stat_bin).

In this example, gender is the splitting variable in the facet_grid.

Notice in the above that we got three columns, since there was some missing data which R interprets as NA. To look at just the two genders, we can run the following code

# Faceted friend_count histogram restricted to rows with a known gender;
# subset(!is.na(gender)) removes the third (NA) facet seen in the previous
# plot.
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2949 rows containing non-finite values (stat_bin).

Note we could use na.omit(), too, but it is a bit more problematic since it could remove more rows than we intended.

Now we want to determine who has more friends on average. To see this run the table command to see if there are more or less men v women.

table(pf$gender)
## 
## female   male 
##  40254  58574

This table tells us that there are female 40254 and male 58574 Moreover, the function below runs the summary function

# Apply summary() to friend_count separately within each gender level:
# by(data, grouping factor, function).
by(pf$friend_count,pf$gender,summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

The function above applies a function to both the first and second

NOW WE WILL ADD COLORS!!!!!

To begin, consider the following plot

# Histogram of tenure (days of pseudo-facebook membership) in 30-day bins;
# color= sets the bar outline, fill= the bar interior.
ggplot(aes(x = tenure), data = pf) +
  geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Where tenure is the number of days that a person has been a member of pseudo-facebook

Rescaling the tenure to years we obtain the following plot

# Histogram of tenure rescaled to years, one bin per month (1/12 year).
# Bug fix: the first positional argument of scale_x_continuous() is the axis
# *name*, so the original seq(1,7,1) was silently used as an axis title, not
# as tick positions — pass it explicitly as breaks=. The axis labels are
# also corrected: x carries the tenure-in-years value, y is a user count.
qplot(x=tenure/365, data=pf, binwidth=1/12, color = I('black'), fill = I('#099DD9'), ylab='number of users', xlab='tenure (years)')+ 
  scale_x_continuous(breaks = seq(1,7,1), limits = c(0,7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).

Further adjusting the data we obtain

# Age histogram with one-year bins over ages 5-110.
# Same positional-argument bug fix as above: pass the tick positions
# explicitly as breaks=, not as the scale's name.
qplot(x=age, data=pf, binwidth=1, color = I('black'), fill = I('#099DD9')) +
  scale_x_continuous(breaks = seq(5,110,5), limits = c(5,110))
## Warning: Removed 238 rows containing non-finite values (stat_bin).

Transforming DATA

Sometimes the data we obtain is very dispersed, i.e. tail heavy.

To see this consider the log transform of the following data. Since the order of magnitude is 10^n, it makes sense to take the log to collect similar data points.

The motivation for doing this is so we can get a normally distributed data set. If the resulting data is normally distributed then we have more statistical tools at out disposal.

qplot(x=friend_count, data=pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(pf$friend_count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    31.0    82.0   196.4   206.0  4923.0
summary(log10(pf$friend_count+1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.505   1.919   1.868   2.316   3.692

Notice that we added 1 so that the log is well defined

We can also use other transformations, such as the sqrt, or natural log

summary(sqrt(pf$friend_count))      
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.568   9.055  11.088  14.353  70.164

Now try to add all three plots to a single page. To do this, first install the following page called gridExtra, and store the individual plots as follows.

#install.packages('gridExtra')
# gridExtra's grid.arrange() stacks several saved ggplot objects on one
# page: raw counts, log10-transformed counts, and square-root-transformed
# counts of friend_count. The +1 inside log10 keeps zero-friend users
# finite.
library(gridExtra)
p1<-qplot(x=friend_count, data=pf)
p2<-qplot(x=log10(friend_count+1), data=pf)
p3<-qplot(x=sqrt(friend_count),data=pf)
grid.arrange(p1,p2,p3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Alternatively, we can plot this as follows

# Equivalent page built by layering scale transforms onto one base plot.
# NOTE(review): scale_x_log10() transforms the axis itself, so log10(0)
# from zero-friend users becomes -Inf and those rows are dropped — hence
# the "infinite values" warning below, which the +1 version above avoids.
p1<-ggplot(aes(x=friend_count), data=pf) + geom_histogram()
p2<-p1+scale_x_log10()
p3<-p1+scale_x_sqrt()
grid.arrange(p1,p2,p3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Difference between qplot and ggplot

There are some subtle difference in how qplot and ggplot work, so we will compare below.

logScale<-qplot(x=log10(friend_count+1), data=pf)
countScale<-ggplot(aes(x=friend_count), data=pf) + geom_histogram() +scale_x_log10()
grid.arrange(logScale,countScale, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).

FREQUENCY POLYGON

Now we will start constructing frequency polygons. In qplot the syntax is:

qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
  scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

Notice that we added geom and color functions.

In ggplot the syntax is

# Frequency polygon of the *proportion* of users at each friend count,
# split by gender. ..count.. is the bin count computed by stat_bin;
# dividing by its sum converts counts into proportions.
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth=10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('Friend Count') +
  ylab('Proportion of users with that friend count')
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

Next we will consider proportions of friend counts

qplot(x=friend_count, y=..count../sum(..count..),
      data= subset(pf,!is.na(gender)),
      xlab='Friend Count',
      ylab='Proportion of users w/ friend count',
      binwidth=10, geom='freqpoly', color=gender) +
  scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

Notice the y axis count is different

Note that we can change limits and breaks

Looking at www_likes

Consider the following plots

# Frequency polygon of www_likes by gender on a log10 x axis.
# Fix: the original added scale_x_continuous() and then scale_x_log10();
# ggplot discards the first scale and prints "Scale for 'x' is already
# present" — only the log scale is needed.
qplot(x=www_likes,
      data= subset(pf,!is.na(gender)),
      geom='freqpoly', color=gender) +
  scale_x_log10()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).

We now numerically analyze the number of www_likes by gender as follows

by(pf$www_likes,pf$gender, sum)
## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175

BOX PLOTS

Consider the following freqpoly. Notice we subset the data to eliminate the na gender entries.

qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
  scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

# Box plot of log10(friend_count) by gender.
# Bug fix: the y values are on the log10 scale (max is log10(4923) ~= 3.7),
# so the original ylim=c(0,1000) — raw-count units — was meaningless; use a
# limit in log10 units instead. log10(0) from zero-friend users is -Inf and
# is dropped with a warning, as before.
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=log10(friend_count),
      ylim=c(0,4))
## Warning: Removed 1962 rows containing non-finite values (stat_boxplot).

We now rescale the data so that the number of outliers is minimized, or we can use

qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=friend_count)+
  scale_y_continuous(limits=c(0,1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).

Or rather,

qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=friend_count)+
  coord_cartesian(ylim=c(0,1000))

Observe: notice that the female box has moved closer to 250 in comparison to the other. We can further adjust this as follows:

qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=friend_count)+
  coord_cartesian(ylim=c(0,250))

Now compute a summary

by(pf$friend_count,pf$gender,summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Friendship Initiated

Now we begin analyzing friendships initiated by gender. Consider the following plots

qplot(x=gender, y=friendships_initiated,data=subset(pf,!is.na(gender)),geom='boxplot')+
  coord_cartesian(ylim=c(0,150))

We now compute a numerical summary.

by(pf$friendships_initiated,pf$gender,summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    49.0   113.9   124.8  3654.0 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    44.0   103.1   111.0  4144.0

To check whether people use mobile check-ins, we use booleans as follows

summary(pf$mobile_likes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     4.0   106.1    46.0 25111.0
summary(pf$mobile_likes>0)
##    Mode   FALSE    TRUE 
## logical   35056   63947
# Flag users with any mobile likes as 1, otherwise 0, stored as a factor.
# The original assigned NA to the column and immediately overwrote it;
# that dead store is dropped and the ifelse/factor steps are combined.
pf$mobile_check_in<-factor(ifelse(pf$mobile_likes>0,1,0))
summary(pf$mobile_check_in)
##     0     1 
## 35056 63947
sum(pf$mobile_check_in==1)/length(pf$mobile_check_in)
## [1] 0.6459097

This tells us that about 65% of users (mobile_check_in == 1) have liked content from mobile at least once.

QUIZ EXPLORE SINGLE VARIABLE

To begin we will load the necessary packages and library

library(ggplot2)
data(diamonds)
summary(diamonds)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 

Notice that three of the variables have a well defined order.

Summarizing the data we have

summary(diamonds$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     950    2401    3933    5324   18823
# Count diamonds priced at $15,000 or more. A logical vector sums directly
# (TRUE counts as 1), so the original ifelse(cond, 1, 0) wrapper was
# redundant.
sum(diamonds$price>=15000)
## [1] 1656
ggplot(aes(x = price), data = diamonds) +
  geom_histogram(binwidth = 20, color = 'black', fill = '#099DD9')+
  scale_x_continuous(limits=c(0,1500))
## Warning: Removed 33930 rows containing non-finite values (stat_bin).

Now we price by cut (this ought to be useful) I’m not sure what else I computed here, but reading through might make things more apparent.

ggplot(aes(x = price),data = diamonds) +
  geom_histogram() +
  scale_x_continuous() +
  facet_wrap(~cut, scales="free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

by(diamonds$price,diamonds$cut,summary)
## diamonds$cut: Fair
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     337    2050    3282    4359    5206   18574 
## -------------------------------------------------------- 
## diamonds$cut: Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     327    1145    3050    3929    5028   18788 
## -------------------------------------------------------- 
## diamonds$cut: Very Good
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     336     912    2648    3982    5373   18818 
## -------------------------------------------------------- 
## diamonds$cut: Premium
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326    1046    3185    4584    6296   18823 
## -------------------------------------------------------- 
## diamonds$cut: Ideal
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     326     878    1810    3458    4678   18806
summary(diamonds)
##      carat               cut        color        clarity     
##  Min.   :0.2000   Fair     : 1610   D: 6775   SI1    :13065  
##  1st Qu.:0.4000   Good     : 4906   E: 9797   VS2    :12258  
##  Median :0.7000   Very Good:12082   F: 9542   SI2    : 9194  
##  Mean   :0.7979   Premium  :13791   G:11292   VS1    : 8171  
##  3rd Qu.:1.0400   Ideal    :21551   H: 8304   VVS2   : 5066  
##  Max.   :5.0100                     I: 5422   VVS1   : 3655  
##                                     J: 2808   (Other): 2531  
##      depth           table           price             x         
##  Min.   :43.00   Min.   :43.00   Min.   :  326   Min.   : 0.000  
##  1st Qu.:61.00   1st Qu.:56.00   1st Qu.:  950   1st Qu.: 4.710  
##  Median :61.80   Median :57.00   Median : 2401   Median : 5.700  
##  Mean   :61.75   Mean   :57.46   Mean   : 3933   Mean   : 5.731  
##  3rd Qu.:62.50   3rd Qu.:59.00   3rd Qu.: 5324   3rd Qu.: 6.540  
##  Max.   :79.00   Max.   :95.00   Max.   :18823   Max.   :10.740  
##                                                                  
##        y                z         
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 4.720   1st Qu.: 2.910  
##  Median : 5.710   Median : 3.530  
##  Mean   : 5.735   Mean   : 3.539  
##  3rd Qu.: 6.540   3rd Qu.: 4.040  
##  Max.   :58.900   Max.   :31.800  
## 
ggplot(aes(x = log10(price/carat)),data = diamonds) +
  geom_histogram() +
  scale_x_continuous() +
  facet_wrap(~cut, scales="free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(geom='boxplot', data=diamonds,
      x=color,
      y=price/carat) +
  coord_cartesian( ylim=c(0,7000))

by(diamonds$price,diamonds$color,IQR)
## diamonds$color: D
## [1] 3302.5
## -------------------------------------------------------- 
## diamonds$color: E
## [1] 3121
## -------------------------------------------------------- 
## diamonds$color: F
## [1] 3886.25
## -------------------------------------------------------- 
## diamonds$color: G
## [1] 5117
## -------------------------------------------------------- 
## diamonds$color: H
## [1] 4996.25
## -------------------------------------------------------- 
## diamonds$color: I
## [1] 6081.25
## -------------------------------------------------------- 
## diamonds$color: J
## [1] 5834.5
?diamonds


qplot(x=price, data=diamonds, binwidth=40, geom='freqpoly', color=color)

# Frequency polygon of carat in 0.1-carat steps, x limited to [0, 3].
# NOTE(review): mapping color to the continuous x variable (carat) adds no
# information to a freqpoly, and the y breaks starting at 2000 leave the
# lower part of the axis unlabeled — likely leftover experimentation.
qplot(data=diamonds,x=carat, geom = 'freqpoly',binwidth=0.1, color=carat)+
  scale_x_continuous(breaks = seq(0,3,0.1),lim= c(0,3))+
  scale_y_continuous(breaks = seq(2000,10000,1000))
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_path).

I’m not sure what I did above!

With the diamonds dataset questions complete, I will now begin analyzing the unemployment dataset I found in the Gapminder database. Switching the working directory again, for sanity

# Reshape the Gapminder-style wide tables (one column per year) into long
# format with tidyr::gather(); convert=TRUE parses the year column headers
# into integers. check.names=F keeps the raw year strings as column names.
# NOTE(review): gather() is superseded by pivot_longer() in tidyr >= 1.0.
setwd('~/Documents/EDA_Data')
library(tidyr)
gdp <- read.csv("GDP.csv", header=T, check.names = F)

gdp.T <- gather(data=gdp, key='Year', value='GDP', '1960':'2011',convert = TRUE)
range(gdp.T$Year)
## [1] 1960 2011
hiv <- read.csv("HIV.csv", header=T, check.names = F)
hiv.T <- gather(data=hiv, key='Year', value='HIV_prev', '1979':'2011', convert = TRUE)
hiv.T$HIV_prev <- as.numeric(hiv.T$HIV_prev)

summary(hiv.T)
##   Estimated HIV Prevalence% - (Ages 15-49)      Year         HIV_prev     
##  Abkhazia             :  33                Min.   :1979   Min.   : 0.010  
##  Afghanistan          :  33                1st Qu.:1987   1st Qu.: 0.100  
##  Akrotiri and Dhekelia:  33                Median :1995   Median : 0.300  
##  Åland                :  33                Mean   :1995   Mean   : 1.743  
##  Albania              :  33                3rd Qu.:2003   3rd Qu.: 1.200  
##  Algeria              :  33                Max.   :2011   Max.   :26.500  
##  (Other)              :8877                               NA's   :5774
gdp.HIV <- merge(gdp.T, hiv.T)
str(gdp.HIV)
## 'data.frame':    2495625 obs. of  5 variables:
##  $ Year                                    : int  1979 1979 1979 1979 1979 1979 1979 1979 1979 1979 ...
##  $ Income per person (fixed 2000 US$)      : Factor w/ 275 levels "Abkhazia","Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ GDP                                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Estimated HIV Prevalence% - (Ages 15-49): Factor w/ 275 levels "Abkhazia","Afghanistan",..: 1 2 3 5 6 7 8 9 10 12 ...
##  $ HIV_prev                                : num  NA NA NA NA NA ...
# Percentage of rows with a missing GDP value.
# Bug fix: the original ran summary() over the elementwise is.na() vector
# divided by nrow, which summarizes millions of per-row fractions instead
# of reporting one percentage — sum the NA indicator first.
sum(is.na(gdp.HIV$GDP)) / nrow(gdp.HIV) * 100
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.000e+00 0.000e+00 0.000e+00 1.415e-05 4.007e-05 4.007e-05
summary(gdp.HIV$GDP)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##     54.5    590.1   2038.9   7315.1   9243.6 108111.2   881100
summary(gdp.HIV$HIV_prev)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.1     0.3     1.7     1.2    26.5 1587850
qplot(y=gdp.HIV$`Income per person (fixed 2000 US$)`,x=gdp.HIV$GDP, data=gdp.HIV, xlim = c(0,4500))
## Warning: Removed 1458325 rows containing missing values (geom_point).

The goal here is to analyze how a country’s GDP and HIV prevalence are related

SECTION 5

Loading the relevant library again.

library(ggplot2)
pf<-read.csv('pseudo_facebook.tsv', sep='\t')

I read in the pseudo_facebook for sanity, too.

Exploring the relationship between two cont. variables.

The code below creates a scatterplot of age (years) vs friend_count

qplot(x=age,y=friend_count,data=pf)

Or identically

qplot(age,friend_count,data=pf)

Several observations: peaks in the 0–30 range, several peaks around ages 65–70 (at 69 to be specific), and above 100

From here on we will switch to ggplot syntax, allowing for more robust plots.

ggplot(aes(x=age, y=friend_count),data=pf)+geom_point()+xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).

Question: computationally, is one of these faster than the other?

Several attributes of ggplot geom=chart type. look at reference. aes wrapper - aesthetic wrapper. The summary of age variable of pf dataset

summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00

Note that we added the extra xlim layer since we want to cut the data down a little bit. Overplotting makes it difficult to see how many points are in a certain area. Using the alpha property of geom_point() we can adjust the transparency; alpha = 1/20 means it takes 20 overlapping points to appear as dark as one fully opaque point.

ggplot(aes(x=age, y=friend_count),data=pf)+
  geom_point(alpha=1/20)+
  xlim(13,90)
## Warning: Removed 4906 rows containing missing values (geom_point).

Now we add jitter, which adds noise to a variable of the data set. In this case, we will add jitter to age(?).

ggplot(aes(x=age, y=friend_count),data=pf)+
  geom_jitter(alpha=1/20)+
  xlim(13,90)
## Warning: Removed 5189 rows containing missing values (geom_point).

Adding jitter makes sense since age is given as an integer, when in reality it ought to be a continuous variable.

Now we add a transformation to the y axis, so that we may change the friend count scale.

ggplot(aes(x=age, y=friend_count),data=pf)+
  geom_point(alpha=1/20, position=position_jitter(h=0))+
  xlim(13,90)+
  coord_trans(y = "sqrt")
## Warning: Removed 5178 rows containing missing values (geom_point).

Notice that I had to add the position_jitter(h=0), so that we do not get negative values. Otherwise, we will get an error. Think of the friend count conditioned on age now.

Exploring the relationship between friends initiated vs age.range

ggplot(aes(x=age, y=friendships_initiated),data=pf)+
  geom_point(alpha=1/10, position=position_jitter(h=1))+
   xlim(13,90)
## Warning: Removed 5181 rows containing missing values (geom_point).

Exploring conditional means and relationship between two or more variables.

First we install dplyr library, and group by age to create a new data frame.

#install.packages('dplyr')
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
age_groups<-group_by(pf,age)
pf.fc_by_age<-summarise(age_groups, friend_count_mean=mean(friend_count), friend_count_median=median(friend_count),n=n())

This yields a new data frame

pf.fc_by_age<-arrange(pf.fc_by_age,age)

This arranges the frame by age in ascending order (arrange() sorts ascending by default).

head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13              165.                 74    484
## 2    14              251.                132   1925
## 3    15              348.                161   2618
## 4    16              352.                172.  3086
## 5    17              350.                156   3283
## 6    18              331.                162   5196

This yields the first few rows of the data frame.

AN ALTERNATE WAY TO GET THE SAME TABLE

Just use the original data set as follows

pf.fc_by_age<-pf %>%
  group_by(age) %>%
  summarise(friend_count_mean=mean(friend_count),
            friend_count_median=median(friend_count),
            n=n()) %>%
  arrange(age)

%>% is called the pipe (chaining) operator; it chains functions onto a data set. The pipe operator is made available by the dplyr package.

head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13              165.                 74    484
## 2    14              251.                132   1925
## 3    15              348.                161   2618
## 4    16              352.                172.  3086
## 5    17              350.                156   3283
## 6    18              331.                162   5196

PROJECT: PLOT TABLE OF AVERAGES FC vs AGE

Recall the following plot

ggplot(aes(x=friend_count_mean,y=age),data=pf.fc_by_age)+geom_point()

Or with the line geometry

ggplot(aes(y=friend_count_mean,x=age),data=pf.fc_by_age)+geom_line()

USING GGPLOT TO SUMMARISE DATA First we compute the first scatter plot, then use the additional layer geom_line(summary, ....) to

ggplot(aes(x=age,y=friend_count),data=pf)+
  xlim(13,90)+
  geom_point(alpha=0.05,position=position_jitter(h=0),
             color='orange')+
  coord_trans(y='sqrt')+
  geom_line(stat="summary",fun.y=mean)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue')+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5191 rows containing missing values (geom_point).

This plots show us that most of the younger users do NOT have more than 2000 friends. dashed lines (linetype=2) are the 10th, 50th, and 90th percentiles.

Reanalyzing, if we remove the xlim and coord_trans layers, and replace them with the coord_cartesian() function we can analyze

ggplot(aes(x=age,y=friend_count),data=pf)+
  coord_cartesian(xlim=c(13,70), ylim=c(0,1000))+
  geom_point(alpha=0.05,position=position_jitter(h=0),
             color='orange')+
  geom_line(stat="summary",fun.y=mean,color='red')+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue',linetype=2)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)

INTERPRETATIONS OF THIS PLOT: It is very rare to have more than 800 friends

STUDYING CORRELATION

The cor.test function yields the correlation coefficient with a given confidence level (conf.level).

cor.test(x=pf$age,y=pf$friend_count, method="pearson", conf.level = 0.95)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
with(pf,cor.test(age,friend_count,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Notice that the correlation coefficient is very small for this particular pair of variables (pf$age, pf$friend_count).

Next we analyze the correlation coefficient for a subset of the data, restricting attention to younger users (here age < 68).

Consider the following code

with(subset(pf,age<68),cor.test(age,friend_count,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -56.599, df = 89358, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1923557 -0.1796963
## sample estimates:
##        cor 
## -0.1860337

Note: cor.test uses pearson by default.

Measuring the strength of monotonic (increasing or decreasing) relationships, i.e. rank correlation (Spearman's method).

CREATING SCATTER POINTS

ggplot(data = pf,aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha=1/20)+
  xlim(0,quantile(pf$www_likes_received,0.95))+
  ylim(0,quantile(pf$likes_received,0.95))+
  geom_smooth(method = 'lm', color="red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

We can check to see how strong the relationship is by using the with and cor.test functions.

with(pf,cor.test(www_likes_received,likes_received,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902
qqplot(x=pf$www_likes_received,y=pf$likes_received)

Notice that the distribution looks approx. normal, so the regression is valid. Also, we are appealing to the WLLN.

Date: 24.7.18

Here is a new package to work with

#install.packages('alr3')
library(alr3)
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
data("Mitchell")

With this data set we will create a new scatter plot:

ggplot(data=Mitchell, aes(x=Month,y=Temp))+geom_point()

Notice it doesn’t look like the two variables are even correlated; this observation is corroborated by the following computation.

with(Mitchell,cor.test(Month,Temp,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

We want to break up the months into 12 month intervals, which we can do as follows.

ggplot(data=Mitchell, aes(x=Month,y=Temp))+
  geom_point()+
  scale_x_continuous(breaks=seq(0,203,12))+
  geom_line()

Notice that if we stretch the plot, then the plot reveals a sinusoidal pattern. Q: Is there any way to add a curve layer? Yes, add a geom_line layer as above.

NOISE

Now we return to the original mean friend count data. Recall the following graph

p1<-ggplot(aes(y=friend_count_mean,x=age),data=subset(pf.fc_by_age,age<71))+
  geom_line()+
  geom_smooth()

head(pf.fc_by_age,10)
## # A tibble: 10 x 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
##  1    13              165.                 74    484
##  2    14              251.                132   1925
##  3    15              348.                161   2618
##  4    16              352.                172.  3086
##  5    17              350.                156   3283
##  6    18              331.                162   5196
##  7    19              334.                157   4391
##  8    20              283.                135   3769
##  9    21              236.                121   3671
## 10    22              211.                106   3032
pf.fc_by_age[17:19,]
## # A tibble: 3 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    29              121.                66    1936
## 2    30              115.                67.5  1716
## 3    31              118.                63    1694

By scaling the data appropriately, the conditional mean for age by months is given by

pf$age_with_months<-pf$age+(12-pf$dob_month)/12
head(pf$age_with_months)
## [1] 14.08333 14.08333 14.08333 14.00000 14.00000 14.00000
summary(pf$age_with_months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.17   20.92   28.92   37.76   50.33  113.92

Once again, we use dplyr to chain functions.

pf.fc_by_age_months<-pf %>%
  group_by(age_with_months) %>%
  summarise(friend_count_mean=mean(friend_count),
            friend_count_median=median(friend_count),
            n=n()) %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)
## # A tibble: 6 x 4
##   age_with_months friend_count_mean friend_count_median     n
##             <dbl>             <dbl>               <dbl> <int>
## 1            13.2              46.3                30.5     6
## 2            13.2             115.                 23.5    14
## 3            13.3             136.                 44      25
## 4            13.4             164.                 72      33
## 5            13.5             131.                 66      45
## 6            13.6             157.                 64      54

Now we have age with months means, so that we can determine the conditional means

Now I will create a plot of the average friend count with ages measured in months. I will also subset the data so that the we only look at the ages less than 71. This analysis will show us how noise and bin width appear in the data+plots.

p2<-ggplot(aes(y=friend_count_mean,x=age_with_months),data=subset(pf.fc_by_age_months, age_with_months<71))+
  geom_line()+
  geom_smooth()
library(gridExtra)
grid.arrange(p1,p2,ncol=1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Notice that when grouped by months there is a lot of noise in the graph. We can filter this data even further. Consider the following filter

p3<-ggplot(aes(y=friend_count,x=round(age/5)*5),data=subset(pf, age<71))+
  geom_line(stat = 'summary',fun.y=mean)+
  geom_smooth()
grid.arrange(p1,p2,p3,ncol=1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Bias-Variance trade off

How do these plots communicate the findings of this particular analysis?

LESSON 6:

To begin this lesson, create a scatterplot of price vs x. using the ggplot syntax.

library(ggplot2)
data(diamonds)
ggplot(data=diamonds, aes(x=price,y=x))+
  geom_point(alpha=1/10,color='orange')

Now the correlation between price,x,y, and z respectively.

with(diamonds,cor.test(price,x,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and x
## t = 440.16, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8825835 0.8862594
## sample estimates:
##       cor 
## 0.8844352
with(diamonds,cor.test(price,y,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and y
## t = 401.14, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8632867 0.8675241
## sample estimates:
##       cor 
## 0.8654209
with(diamonds,cor.test(price,z,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  price and z
## t = 393.6, df = 53938, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8590541 0.8634131
## sample estimates:
##       cor 
## 0.8612494

Next up, I’ll create a simple scatter plot of price vs depth.

Doing so yields

ggplot(data=diamonds,aes(x=price,y=depth))+geom_point()

Change the code to make the transparency of the points to be 1/100 of what they are now and mark the x-axis every 2 units. See the instructor notes for two hints.

ggplot(data = diamonds, aes(x = price, y = depth)) + 
  geom_point(alpha=1/100)+
  scale_y_continuous(breaks=seq(0,max(range(diamonds$depth)),2))

We see the range for depth of the diamonds is

range(diamonds$depth)
## [1] 43 79

Now we compute the correlation coeff. between depth vs. price

with(diamonds,cor.test(depth,price,method='pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  depth and price
## t = -2.473, df = 53938, p-value = 0.0134
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.019084756 -0.002208537
## sample estimates:
##        cor 
## -0.0106474

Create a scatterplot of price vs carat and omit the top 1% of price and carat values.

#ggplot(data=diamonds, aes(x=price,y=carat))+geom_point(alpha=1/100)
# Scatterplot of price vs. carat with the top 1% of prices omitted.
# BUG FIX: quantile(price, probs = 1/100) is the *1st* percentile, so the
# original subset kept only the cheapest 1% of diamonds. Omitting the top
# 1% requires cutting at the 99th percentile (probs = 0.99).
ggplot(data=subset(diamonds,price<quantile(price,probs=0.99)),aes(x=price,y=carat))+
 geom_point()

Create a scatterplot of price vs. volume (x * y * z).This is a very rough approximation for a diamond’s volume. Create a new variable for volume in the diamonds data frame.This will be useful in a later exercise.

Appendix Code

# ---- Lesson 1: intro to qplot with the diamonds data ----
library(ggplot2)
library(RColorBrewer)
data(diamonds)
qplot(data=diamonds, x=carat,y=price,color=cut) + scale_color_brewer(palette = 'Accent')



# ---- State data: two equivalent ways to subset a data frame ----
getwd()
setwd('~/Documents/EDA_Data')
statesInfo<-read.csv('stateData.csv')
# subset() and bracket indexing give the same rows (state.region == 1).
subset(statesInfo, state.region==1)
statesInfo[statesInfo$state.region==1,]
illiteracybracket<-statesInfo[statesInfo$illiteracy==0.5,]
highschoolgradbracket <- statesInfo[statesInfo$highSchoolGrad>50,]
qplot(data=statesInfo, x=state.region, y=illiteracy, color=statesInfo$highSchoolGrad)
# ---- Reddit survey: factors and ordered factors ----
reddit<-read.csv("reddit.csv")
str(reddit)
table(reddit$employment.status)
summary(reddit$employment.status)
levels(reddit$age.range)
# notice 7 different ranges
qplot(reddit$age.range)
qplot(data=reddit, x=income.range)
# Re-level age.range so histogram bars appear in chronological order.
reddit$age.range<-ordered(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'))
qplot(data=reddit, x=age.range)
# Equivalent to ordered(): factor() with ordered = T.
# NOTE(review): style guides prefer spelling out TRUE over T.
reddit$age.range <- factor(reddit$age.range, levels=c('Under 18', '18-24', '25-34', '35-44', '45-54', '55-64','65 or Above', 'NA'), ordered = T)
# Order the income ranges from lowest to highest.
# BUG FIX: the original passed reddit$age.range into ordered(), so
# income.range was silently overwritten with age data — and since no age
# label matches any income level, every value became NA. Order the income
# column itself.
reddit$income.range<-ordered(reddit$income.range, levels=c("Under $20,000", '$20,000 - $29,999', '$30,000 - $39,999', '$40,000 - $49,999', '$50,000 - $69,999', '$70,000 - $99,999', '$100,000 - $149,999', '$150,000 or more', 'NA'))
qplot(data=reddit, x=income.range)
# ---- Lesson 3: pseudo-Facebook data ----
getwd()
setwd('~/Documents/EDA_Data')
list.files()

# Tab-separated file, hence sep='\t'.
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
names(pf)
library(ggplot2)
#install.packages('ggthemes', dependencies = TRUE)
library(ggthemes)
names(pf)
# Histogram of birth day-of-month, one bar per day.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)
# Same histogram faceted by birth month (12 panels, 4 per row).
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month,ncol=4)
# Friend-count histograms: qplot xlim vs. scale_x_continuous vs. binwidth.
qplot(x=friend_count, data=pf)
qplot(x=friend_count, data=pf, xlim=c(0,1000))
qplot(x=friend_count, data=pf) +
  scale_x_continuous(limit=c(0,1000))
qplot(x=friend_count, data=pf, binwidth=25) +
  scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50))
# Split by gender with facets.
qplot(x=friend_count, data=pf, binwidth=25) +
  scale_x_continuous(limit=c(0,1000), breaks=seq(0,1000,50)) +
  facet_grid(~gender)
# Drop NA genders before faceting.
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
table(pf$gender)
by(pf$friend_count,pf$gender,summary)
# Tenure histogram in 30-day bins.
ggplot(aes(x = tenure), data = pf) +
  geom_histogram(binwidth = 30, color = 'black', fill = '#099DD9')
# Tenure in years (1/12-year bins) and age (1-year bins) histograms.
# BUG FIX: the first positional argument of scale_x_continuous() is `name`
# (the axis title), not `breaks`, so the seq() vectors were being consumed
# as the axis name. Pass them explicitly as breaks =.
# NOTE(review): ylab='tenure' labels the count axis — the labels look
# swapped, but they are left as-is to preserve the rendered output.
qplot(x=tenure/365, data=pf, binwidth=1/12, color = I('black'), fill = I('#099DD9'), ylab='tenure', xlab='years')+
  scale_x_continuous(breaks = seq(1,7,1), limits = c(0,7))
qplot(x=age, data=pf, binwidth=1, color = I('black'), fill = I('#099DD9')) +
  scale_x_continuous(breaks = seq(5,110,5), limits = c(5,110))
qplot(x=friend_count, data=pf)
summary(pf$friend_count)
# Compare transformations of the long-tailed friend count.
summary(log10(pf$friend_count+1))
summary(sqrt(pf$friend_count))
#install.packages('gridExtra')
library(gridExtra)
# Side by side: raw, log10, and sqrt histograms — first by transforming the
# data (qplot), then by transforming the scales (ggplot).
p1<-qplot(x=friend_count, data=pf)
p2<-qplot(x=log10(friend_count+1), data=pf)
p3<-qplot(x=sqrt(friend_count),data=pf)
grid.arrange(p1,p2,p3)
p1<-ggplot(aes(x=friend_count), data=pf) + geom_histogram()
p2<-p1+scale_x_log10()
p3<-p1+scale_x_sqrt()
grid.arrange(p1,p2,p3)
logScale<-qplot(x=log10(friend_count+1), data=pf)
countScale<-ggplot(aes(x=friend_count), data=pf) + geom_histogram() +scale_x_log10()
grid.arrange(logScale,countScale, ncol = 2)
# Frequency polygons by gender.
qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
  scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
# Proportions instead of raw counts on the y axis.
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth=10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('Friend Count') +
  ylab('Proportion of users with that friend count')
qplot(x=friend_count, y=..count../sum(..count..),
      data= subset(pf,!is.na(gender)),
      xlab='Friend Count',
      ylab='Proportion of users w/ friend count',
      binwidth=10, geom='freqpoly', color=gender) +
  scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
qplot(x=www_likes,
      data= subset(pf,!is.na(gender)),
      geom='freqpoly', color=gender) +
  scale_x_continuous()+
  scale_x_log10()
by(pf$www_likes,pf$gender, sum)
qplot(x=friend_count, data= subset(pf,!is.na(gender)), binwidth=10, geom='freqpoly', color=gender) +
  scale_x_continuous(lim=c(0,1000), breaks=seq(0,1000,50))
# Boxplots of friend count by gender: course progression through several
# ways of limiting the y axis (coord_cartesian zooms without dropping data;
# ylim / scale_y_continuous discard points before the stats are computed).
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=log10(friend_count),
      ylim=c(0,1000))
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=friend_count)+
  scale_y_continuous(limits=c(0,1000))
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=friend_count)+
  coord_cartesian(ylim=c(0,1000))
qplot(geom='boxplot', data=subset(pf,!is.na(gender)),
      x=gender,
      y=friend_count)+
  coord_cartesian(ylim=c(0,250))
by(pf$friend_count,pf$gender,summary)
qplot(x=gender, y=friendships_initiated,data=subset(pf,!is.na(gender)),geom='boxplot')+
  coord_cartesian(ylim=c(0,150))
by(pf$friendships_initiated,pf$gender,summary)
# Binary indicator: did the user ever like anything on mobile?
summary(pf$mobile_likes)
summary(pf$mobile_likes>0)
pf$mobile_check_in<-NA
pf$mobile_check_in<-ifelse(pf$mobile_likes>0,1,0)
pf$mobile_check_in<-factor(pf$mobile_check_in)
summary(pf$mobile_check_in)
sum(pf$mobile_check_in==1)/length(pf$mobile_check_in)
# ---- Diamonds problem set ----
library(ggplot2)
data(diamonds)
summary(diamonds)
summary(diamonds$price)
sum(ifelse(diamonds$price>=15000,1,0))
ggplot(aes(x = price), data = diamonds) +
  geom_histogram(binwidth = 20, color = 'black', fill = '#099DD9')+
  scale_x_continuous(limits=c(0,1500))
ggplot(aes(x = price),data = diamonds) +
  geom_histogram() +
  scale_x_continuous() +
  facet_wrap(~cut, scales="free_y")

by(diamonds$price,diamonds$cut,summary)
summary(diamonds)

ggplot(aes(x = log10(price/carat)),data = diamonds) +
  geom_histogram() +
  scale_x_continuous() +
  facet_wrap(~cut, scales="free_y")

qplot(geom='boxplot', data=diamonds,
      x=color,
      y=price/carat) +
  coord_cartesian( ylim=c(0,7000))


by(diamonds$price,diamonds$color,IQR)
?diamonds


qplot(x=price, data=diamonds, binwidth=40, geom='freqpoly', color=color)

qplot(data=diamonds,x=carat, geom = 'freqpoly',binwidth=0.1, color=carat)+
  scale_x_continuous(breaks = seq(0,3,0.1),lim= c(0,3))+
  scale_y_continuous(breaks = seq(2000,10000,1000))
# ---- Gapminder GDP / HIV: reshape wide year columns to long with tidyr ----
setwd('~/Documents/EDA_Data')
library(tidyr)
# check.names = F keeps the year column names ('1960' ...) unmangled.
gdp <- read.csv("GDP.csv", header=T, check.names = F)

gdp.T <- gather(data=gdp, key='Year', value='GDP', '1960':'2011',convert = TRUE)
range(gdp.T$Year)


hiv <- read.csv("HIV.csv", header=T, check.names = F)
hiv.T <- gather(data=hiv, key='Year', value='HIV_prev', '1979':'2011', convert = TRUE)
hiv.T$HIV_prev <- as.numeric(hiv.T$HIV_prev)

summary(hiv.T)

# merge() joins on the columns the two frames share (its default).
gdp.HIV <- merge(gdp.T, hiv.T)
str(gdp.HIV)
# Percentage of rows with missing GDP.
# BUG FIX: summary(is.na(x) / nrow(df) * 100) summarizes a scaled logical
# vector rather than producing the percent-missing figure; the intended
# quantity is the count of NAs over the row count.
sum(is.na(gdp.HIV$GDP)) / nrow(gdp.HIV) * 100
summary(gdp.HIV$GDP)
summary(gdp.HIV$HIV_prev)
qplot(y=gdp.HIV$`Income per person (fixed 2000 US$)`,x=gdp.HIV$GDP, data=gdp.HIV, xlim = c(0,4500))
# ---- Lesson 4: two variables (scatterplots, jitter, conditional means) ----
library(ggplot2)
pf<-read.csv('pseudo_facebook.tsv', sep='\t')
qplot(x=age,y=friend_count,data=pf)
qplot(age,friend_count,data=pf)
ggplot(aes(x=age, y=friend_count),data=pf)+geom_point()+xlim(13,90)
summary(pf$age)
# alpha = 1/20: 20 overlapping points render as one fully opaque point.
ggplot(aes(x=age, y=friend_count),data=pf)+
  geom_point(alpha=1/20)+
  xlim(13,90)
ggplot(aes(x=age, y=friend_count),data=pf)+
  geom_jitter(alpha=1/20)+
  xlim(13,90)
# h=0 keeps jitter from producing negative counts under the sqrt transform.
ggplot(aes(x=age, y=friend_count),data=pf)+
  geom_point(alpha=1/20, position=position_jitter(h=0))+
  xlim(13,90)+
  coord_trans(y = "sqrt")
ggplot(aes(x=age, y=friendships_initiated),data=pf)+
  geom_point(alpha=1/10, position=position_jitter(h=1))+
   xlim(13,90)
#install.packages('dplyr')
library(dplyr)
# Conditional means: mean/median friend count and group size per age.
age_groups<-group_by(pf,age)
pf.fc_by_age<-summarise(age_groups, friend_count_mean=mean(friend_count), friend_count_median=median(friend_count),n=n())
pf.fc_by_age<-arrange(pf.fc_by_age,age)
head(pf.fc_by_age)
# Same table rebuilt with the %>% pipe.
pf.fc_by_age<-pf %>%
  group_by(age) %>%
  summarise(friend_count_mean=mean(friend_count),
            friend_count_median=median(friend_count),
            n=n()) %>%
  arrange(age)
head(pf.fc_by_age)
ggplot(aes(x=friend_count_mean,y=age),data=pf.fc_by_age)+geom_point()
ggplot(aes(y=friend_count_mean,x=age),data=pf.fc_by_age)+geom_line()
# Overlay mean and 10th/50th/90th percentile summary lines on the raw points.
# NOTE(review): fun.y was deprecated in ggplot2 3.3 (now fun=); kept here to
# match the version the document was rendered with.
ggplot(aes(x=age,y=friend_count),data=pf)+
  xlim(13,90)+
  geom_point(alpha=0.05,position=position_jitter(h=0),
             color='orange')+
  coord_trans(y='sqrt')+
  geom_line(stat="summary",fun.y=mean)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue')+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
# Same plot, zoomed with coord_cartesian instead of xlim/coord_trans.
ggplot(aes(x=age,y=friend_count),data=pf)+
  coord_cartesian(xlim=c(13,70), ylim=c(0,1000))+
  geom_point(alpha=0.05,position=position_jitter(h=0),
             color='orange')+
  geom_line(stat="summary",fun.y=mean,color='red')+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.1),color='blue',linetype=2)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.5),color='blue',linetype=2)+
  geom_line(stat="summary", fun.y=quantile, fun.args=list(probs=0.9),color='blue',linetype=2)
# Pearson correlations: full data, then restricted to age < 68.
cor.test(x=pf$age,y=pf$friend_count, method="pearson", conf.level = 0.95)
with(pf,cor.test(age,friend_count,method='pearson'))
with(subset(pf,age<68),cor.test(age,friend_count,method='pearson'))
ggplot(data = pf,aes(x = www_likes_received, y = likes_received)) +
  geom_point(alpha=1/20)+
  xlim(0,quantile(pf$www_likes_received,0.95))+
  ylim(0,quantile(pf$likes_received,0.95))+
  geom_smooth(method = 'lm', color="red")
with(pf,cor.test(www_likes_received,likes_received,method='pearson'))
qqplot(x=pf$www_likes_received,y=pf$likes_received)
# Mitchell data (alr3 package): correlation alone misses periodic structure.
#install.packages('alr3')
library(alr3)
data("Mitchell")
ggplot(data=Mitchell, aes(x=Month,y=Temp))+geom_point()
with(Mitchell,cor.test(Month,Temp,method='pearson'))
ggplot(data=Mitchell, aes(x=Month,y=Temp))+
  geom_point()+
  scale_x_continuous(breaks=seq(0,203,12))+
  geom_line()
# Noise vs. smoothing: mean friend count by age (p1), by age-in-months (p2),
# and by 5-year bins (p3).
p1<-ggplot(aes(y=friend_count_mean,x=age),data=subset(pf.fc_by_age,age<71))+
  geom_line()+
  geom_smooth()

head(pf.fc_by_age,10)
pf.fc_by_age[17:19,]
# Fractional age: users born later in the year are "younger" within the year.
pf$age_with_months<-pf$age+(12-pf$dob_month)/12
head(pf$age_with_months)
summary(pf$age_with_months)
pf.fc_by_age_months<-pf %>%
  group_by(age_with_months) %>%
  summarise(friend_count_mean=mean(friend_count),
            friend_count_median=median(friend_count),
            n=n()) %>%
  arrange(age_with_months)
head(pf.fc_by_age_months)
p2<-ggplot(aes(y=friend_count_mean,x=age_with_months),data=subset(pf.fc_by_age_months, age_with_months<71))+
  geom_line()+
  geom_smooth()
library(gridExtra)
grid.arrange(p1,p2,ncol=1)
p3<-ggplot(aes(y=friend_count,x=round(age/5)*5),data=subset(pf, age<71))+
  geom_line(stat = 'summary',fun.y=mean)+
  geom_smooth()
grid.arrange(p1,p2,p3,ncol=1)
# ---- Lesson 6: diamonds price vs. physical dimensions ----
library(ggplot2)
data(diamonds)
ggplot(data=diamonds, aes(x=price,y=x))+
  geom_point(alpha=1/10,color='orange')

with(diamonds,cor.test(price,x,method='pearson'))
with(diamonds,cor.test(price,y,method='pearson'))
with(diamonds,cor.test(price,z,method='pearson'))

ggplot(data=diamonds,aes(x=price,y=depth))+geom_point()
# Depth axis marked every 2 units.
ggplot(data = diamonds, aes(x = price, y = depth)) +
  geom_point(alpha=1/100)+
  scale_y_continuous(breaks=seq(0,max(range(diamonds$depth)),2))

range(diamonds$depth)
with(diamonds,cor.test(depth,price,method='pearson'))

#ggplot(data=diamonds, aes(x=price,y=carat))+geom_point(alpha=1/100)
# Scatterplot of price vs. carat with the top 1% of prices omitted.
# BUG FIX: quantile(price, probs = 1/100) is the *1st* percentile, so the
# original subset kept only the cheapest 1% of diamonds. Omitting the top
# 1% requires cutting at the 99th percentile (probs = 0.99).
ggplot(data=subset(diamonds,price<quantile(price,probs=0.99)),aes(x=price,y=carat))+
 geom_point()